In [1]:
import os
import numpy as np
import pandas as pd

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.losses import BinaryCrossentropy, categorical_crossentropy
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD

# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6;
# try the new name first and fall back so the notebook runs on old and new versions.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
%matplotlib inline

## seaborn
import seaborn as sns

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")

Image Classification

This notebook follows the official TensorFlow image-classification tutorial and demonstrates binary image classification with TensorFlow. The dataset used here is a filtered version of the Dogs vs. Cats dataset from Kaggle.

Dogs vs. Cats Dataset

Downloading Data

In [2]:
def Get_Data(_URL, Remove = True):
    """Download and extract a dataset zip archive into the working directory.

    Parameters
    ----------
    _URL : str
        Direct URL to a ``.zip`` archive.
    Remove : bool, default True
        If True, delete the downloaded archive after extraction.

    Returns
    -------
    str
        Path to the extracted dataset directory
        (``<cwd>/datasets/<archive name without extension>``).
    """
    # Archive file name, e.g. 'cats_and_dogs_filtered.zip'
    File = _URL.split('/')[-1]
    Full_Name = os.path.join(os.getcwd(), File)
    # Download the archive and extract it; with cache_dir=cwd, Keras extracts
    # under '<cwd>/datasets'. get_file returns the path of the downloaded zip.
    path_to_zip = tf.keras.utils.get_file(fname=Full_Name, origin=_URL, extract=True, cache_dir=os.getcwd())
    # os.path.splitext is safer than File.split('.')[0] for names containing dots.
    PATH = os.path.join(os.path.dirname(path_to_zip), 'datasets', os.path.splitext(File)[0])
    # Delete the zip once extracted; use the path Keras actually wrote to
    # (not a cwd-relative name) and tolerate it already being gone.
    if Remove and os.path.exists(path_to_zip):
        os.remove(path_to_zip)
    return PATH
    #-----------------------------------------------------------------
    
# URL of the filtered Dogs vs. Cats archive hosted by Google
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
# Download and extract it; PATH is the local dataset root
PATH = Get_Data(_URL)

Dataset Directory Structure

In [3]:
def Path_Tree(PATH):
    """Print a colorized two-level directory tree of PATH with per-folder file counts.

    Parameters
    ----------
    PATH : str
        Root directory; its sub-directories and their sub-directories are listed,
        together with the number of files and a five-file preview in each leaf.
    """
    sep = ' ' * 3
    # os.path.basename is portable; splitting on '\\' only worked on Windows.
    title = os.path.basename(PATH)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + title + ':' + Style.RESET_ALL)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    for entry in os.listdir(PATH):
        sub = os.path.join(PATH, entry)
        if os.path.isdir(sub):
            print('└──', Back.CYAN + Fore.BLACK + Style.NORMAL + entry + ':' + Style.RESET_ALL)
            for entry1 in os.listdir(sub):
                sub1 = os.path.join(sub, entry1)
                # BUG FIX: the original re-tested `sub` here; it must test `sub1`,
                # otherwise plain files inside `sub` were treated as directories.
                if os.path.isdir(sub1):
                    print(sep + '└──', Back.MAGENTA + Fore.BLACK + Style.NORMAL + entry1 + ':' + Style.RESET_ALL)
                    List = os.listdir(sub1)
                    # Guard against an empty leaf directory (List[0] would raise).
                    if List:
                        print(2 * sep, Back.YELLOW + Fore.BLACK + Style.NORMAL +
                              '%i %s files' % (len(List), List[0].split('.')[-1].upper()) + Style.RESET_ALL)
                        print(2 * sep, ', '.join(List[:5]) + ', ...')
    #-----------------------------------------------------------------

Path_Tree(PATH)
=======================
cats_and_dogs_filtered:
=======================
└── train:
   └── cats:
       1000 JPG files
       cat.0.jpg, cat.1.jpg, cat.10.jpg, cat.100.jpg, cat.101.jpg, ...
   └── dogs:
       1000 JPG files
       dog.0.jpg, dog.1.jpg, dog.10.jpg, dog.100.jpg, dog.101.jpg, ...
└── validation:
   └── cats:
       500 JPG files
       cat.2000.jpg, cat.2001.jpg, cat.2002.jpg, cat.2003.jpg, cat.2004.jpg, ...
   └── dogs:
       500 JPG files
       dog.2000.jpg, dog.2001.jpg, dog.2002.jpg, dog.2003.jpg, dog.2004.jpg, ...
In [4]:
def Data_Info(PATH):
    """Collect set/subset sizes and directory paths for a two-level dataset tree.

    Parameters
    ----------
    PATH : str
        Dataset root containing one directory per set (e.g. 'train',
        'validation'), each holding one directory per class.

    Returns
    -------
    (pandas.DataFrame, dict)
        DataFrame with columns 'Set', 'Subset', 'Size' (file count per class
        folder), and a dict mapping '<set>' and '<set>_<subset>' keys to their
        directory paths. Also displays the transposed size table.
    """
    Set = []
    Subset = []
    Size = []
    DataDirs = {}
    for entry in os.listdir(PATH):
        sub = os.path.join(PATH, entry)
        if os.path.isdir(sub):
            DataDirs[entry] = sub
            for entry1 in os.listdir(sub):
                sub1 = os.path.join(sub, entry1)
                # BUG FIX: test `sub1` (the class directory), not `sub`;
                # the original would also count plain files as class folders.
                if os.path.isdir(sub1):
                    DataDirs[entry + '_' + entry1] = sub1
                    Set.append(entry.title())
                    Subset.append(entry1.title())
                    Size.append(len(os.listdir(sub1)))

    DataFrame_Info = pd.DataFrame({'Set': Set, 'Subset': Subset, 'Size': Size})
    display(DataFrame_Info.set_index(['Set', 'Subset']).T)
    return DataFrame_Info, DataDirs
    #-----------------------------------------------------------------

DataFrame_Info, DataDirs = Data_Info(PATH)
Set Train Validation
Subset Cats Dogs Cats Dogs
Size 1000 1000 500 500

Preprocessing

In [5]:
# Training hyper-parameters and target image geometry
batch_size = 128  # images per generator batch
epochs = 15  # number of training epochs
Img_Height = 150  # resize target height (pixels)
Img_Width = 150  # resize target width (pixels)

Train Data Image DataGenerator

In [6]:
# Training generator: rescales pixel values and applies random augmentations
# (rotation, shifts, flip, zoom) to reduce overfitting on the small dataset.
image_gen_train = ImageDataGenerator(
    # Rescaling the tensors from values between 0 and 255 to values between 0 and 1
    rescale=1./255,
    # Applying up to 45 degrees of rotation randomly
    rotation_range=45,
    # Range for random horizontal shifts (fraction of total width).
    width_shift_range=.15,
    # Range for random vertical shifts (fraction of total height).
    height_shift_range=.15,
    # Applying random horizontal flip augmentation
    horizontal_flip=True,
    # Applying a zoom augmentation to the dataset to zoom images up to 50%
    zoom_range=0.5
    )

print(Back.WHITE + Fore.BLACK + Style.NORMAL + 'Train Data:'+ Style.RESET_ALL)
# flow_from_directory loads images from disk, resizes them to
# (Img_Height, Img_Width) and yields shuffled (image, label) batches;
# class_mode='binary' gives scalar 0/1 labels (one per class folder).
train_data_gen = image_gen_train.flow_from_directory(batch_size=batch_size,
                                                     directory=DataDirs['train'],
                                                     shuffle=True,
                                                     target_size=(Img_Height, Img_Width),
                                                     class_mode='binary')
Train Data:
Found 2000 images belonging to 2 classes.
In [7]:
def plotImages(images_arr, s = 3.4, Title = False):
    """Plot a horizontal strip of images with axes hidden.

    Parameters
    ----------
    images_arr : sequence of array-like
        Images to display, one subplot per image.
    s : float, default 3.4
        Side length (inches) of each square subplot.
    Title : str or False, default False
        Optional figure-level title; omitted when falsy.
    """
    # squeeze=False guarantees a 2-D Axes array even for a single image,
    # so .flatten() below is always safe (the original crashed for len == 1).
    fig, axes = plt.subplots(1, len(images_arr), figsize=(s * len(images_arr), s), squeeze=False)
    axes = axes.flatten()
    for img, ax in zip(images_arr, axes):
        ax.imshow(img)
        ax.axis('off')
    _ = fig.tight_layout()
    _ = fig.subplots_adjust(wspace=5e-3)
    if Title:
        # NOTE: removed two FontProperties objects the original built but never used.
        _ = fig.suptitle(Title, y=1.05, fontsize=18)

    
# Pull one batch from the training generator and preview four images
sample_images, _ = next(train_data_gen)
plotImages(sample_images[:4], Title = 'Four Random Pictures from the Train Sample')

# Each access to batch 0 re-runs the random augmentation pipeline, so these
# five pictures show differently-augmented outputs of the generator
augmented_images = [train_data_gen[0][0][0] for i in range(5)]
plotImages(augmented_images, Title = 'Original and Augmented Pictures', s = 3)

Validation Data Image DataGenerator

In [8]:
# Validation images are only rescaled — no augmentation — so validation
# metrics reflect performance on unmodified data.
validation_image_generator = ImageDataGenerator(rescale=1./255)

print(Back.WHITE + Fore.BLACK + Style.NORMAL + 'Validation Data:'+ Style.RESET_ALL)
val_data_gen = validation_image_generator.flow_from_directory(batch_size = batch_size,
                                                              directory = DataDirs['validation'],
                                                              target_size = (Img_Height, Img_Width),
                                                              class_mode = 'binary')

# Preview four validation images.
# BUG FIX: the title previously said 'Train Sample' although these images
# come from the validation generator.
sample_images, _ = next(val_data_gen)
plotImages(sample_images[:4], Title = 'Four Random Pictures from the Validation Sample')
Validation Data:
Found 1000 images belonging to 2 classes.

Modeling

We use the Keras `Sequential` API to build the model.

In [9]:
# CNN: three Conv/MaxPool stages (with dropout for regularization) followed
# by a dense head; the final layer emits a single raw logit, matched later
# by a BinaryCrossentropy(from_logits=True) loss.
model = Sequential(
    [
        Conv2D(16, 3, padding='same', activation='relu', input_shape=(Img_Height, Img_Width, 3)),
        MaxPooling2D(),
        Dropout(0.2),  # regularization

        Conv2D(32, 3, padding='same', activation='relu'),
        MaxPooling2D(),
        Conv2D(64, 3, padding='same', activation='relu'),
        MaxPooling2D(),
        Dropout(0.2),  # regularization

        Flatten(),
        Dense(512, activation='relu'),
        Dense(1),  # raw logit, no activation
    ],
    name='CNN',
)

model.summary()
plot_model(model, show_shapes=True, show_layer_names=False, expand_nested = True, rankdir = 'TB')
Model: "CNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 150, 150, 16)      448       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 75, 75, 16)        0         
_________________________________________________________________
dropout (Dropout)            (None, 75, 75, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 75, 75, 32)        4640      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 37, 37, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 37, 37, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 18, 18, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 18, 18, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 20736)             0         
_________________________________________________________________
dense (Dense)                (None, 512)               10617344  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
=================================================================
Total params: 10,641,441
Trainable params: 10,641,441
Non-trainable params: 0
_________________________________________________________________
Out[9]:

Compiling and fitting the model

In [10]:
# Adam optimizer with binary cross-entropy on raw logits (the model's last
# Dense layer has no activation, hence from_logits=True).
model.compile(optimizer = 'adam', loss = BinaryCrossentropy(from_logits=True), metrics = ['accuracy'])
# fit() accepts generators directly since TF 2.1; fit_generator is deprecated
# and removed in recent TensorFlow releases.
# steps_per_epoch / validation_steps = total images in the set // batch size.
history = model.fit(train_data_gen,
              steps_per_epoch = DataFrame_Info.loc[DataFrame_Info['Set'] == 'Train','Size'].sum() // batch_size,
              epochs = epochs,
              validation_data = val_data_gen,
              validation_steps = DataFrame_Info.loc[DataFrame_Info['Set'] == 'Validation','Size'].sum() // batch_size)
# Hide the per-epoch progress bars in the saved notebook
clear_output()
In [11]:
def Table_History(history):
    """Display the training history as a styled (Train/Validation × Loss/Accuracy) table.

    Parameters
    ----------
    history : keras History object whose `.history` dict contains
        'loss', 'accuracy', 'val_loss' and 'val_accuracy'.
    """
    # Select the columns explicitly so the Train/Validation labels are correct
    # regardless of the order of keys in history.history (the original relied
    # on dict insertion order matching the MultiIndex).
    Values = pd.DataFrame(history.history)[['loss', 'accuracy', 'val_loss', 'val_accuracy']].values
    Table = pd.DataFrame(Values,
                     columns = pd.MultiIndex.from_product([['Train', 'Validation'], ['Loss', 'Accuracy']]))
    # Accuracy shaded green, loss shaded warm; 4-decimal display via
    # Styler.format (Styler.set_precision is deprecated).
    display(Table.style.background_gradient(subset= [('Train', 'Accuracy'), ('Validation', 'Accuracy')], cmap='BuGn')\
            .background_gradient(subset= [( 'Train','Loss'), ('Validation', 'Loss')], cmap='Wistia').format('{:.4f}'))
    # -------------------------------------------------------------------------------------------
    
def Plot_History(history, L = [0, 1], R = [0, 1]):
    """Plot train/validation loss and accuracy curves side by side with Plotly.

    Parameters
    ----------
    history : keras History object (reads `history.history`).
    L : [min, max]
        y-axis range for the loss panel (left).
    R : [min, max]
        y-axis range for the accuracy panel (right).
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.05, column_widths=[0.5, 0.5],
                    specs=[[{"type": "scatter"},{"type": "scatter"}]],
                    subplot_titles=('Loss', 'Accuracy'))
    Records = pd.DataFrame(history.history)
    # 1-based epoch numbers for the x axis
    Epochs = np.arange(1, Records.index.max() + 2)

    # (history key, panel column, line colour, legend label) for each curve
    curves = [('loss',         1, 'DeepPink',     'Train'),
              ('val_loss',     1, 'MidnightBlue', 'Validation'),
              ('accuracy',     2, 'DeepPink',     'Train'),
              ('val_accuracy', 2, 'MidnightBlue', 'Validation')]
    for key, col, colour, label in curves:
        fig.add_trace(go.Scatter(x=Epochs, y=Records[key].values,
                                 line=dict(color=colour, width=1.5), name=label), 1, col)
    # Show the legend only once: hide it for the loss panel's traces
    fig.update_traces(showlegend=False, row=1, col=1)

    fig.update_layout(plot_bgcolor='white')
    # Both panels share identical axis cosmetics; only the y-range differs
    for col, y_range in ((1, L), (2, R)):
        fig.update_xaxes(range=[Epochs.min() - 1, Epochs.max() + 1],
                         zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray',
                         showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=col)
        fig.update_yaxes(range=y_range,
                         showgrid=True, gridwidth=1, gridcolor='Lightgray',
                         zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray',
                         showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=col)
    fig.show()
    # -------------------------------------------------------------------------------------------
In [12]:
# Summarise the run as a table, then plot the loss/accuracy curves
Table_History(history)    
Plot_History(history, L = [0.5, 1], R = [0.4, .8])
Train Validation
Loss Accuracy Loss Accuracy
0 1.0909 0.5011 0.6960 0.4888
1 0.6935 0.5032 0.6901 0.4888
2 0.6925 0.4979 0.6918 0.4888
3 0.6913 0.5032 0.6915 0.4888
4 0.6912 0.5011 0.6954 0.4888
5 0.6882 0.4989 0.6863 0.4888
6 0.6805 0.5128 0.6799 0.4944
7 0.6728 0.5299 0.6702 0.4989
8 0.6683 0.5406 0.6696 0.5089
9 0.6625 0.5673 0.6698 0.5033
10 0.6594 0.5518 0.6546 0.5134
11 0.6437 0.5887 0.6432 0.5246
12 0.6198 0.6421 0.6782 0.5335
13 0.6133 0.6368 0.6411 0.5424
14 0.6063 0.6416 0.6444 0.5458